R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: ## Import all the libraries

importlib <- c("ggplot2", "stringr", "magrittr", "futile.logger", "VennDiagram", "tm", "SnowballC", "wordcloud", "RColorBrewer", "lattice", "caret", "rpart", "rpart.plot", "randomForest", "e1071", "ROCR", "gmodels", "mime", "plotly")

require(importlib)
## Loading required package: importlib
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'importlib'
lapply(importlib, require, character.only = TRUE)
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE
## 
## [[5]]
## [1] TRUE
## 
## [[6]]
## [1] TRUE
## 
## [[7]]
## [1] TRUE
## 
## [[8]]
## [1] TRUE
## 
## [[9]]
## [1] TRUE
## 
## [[10]]
## [1] TRUE
## 
## [[11]]
## [1] TRUE
## 
## [[12]]
## [1] TRUE
## 
## [[13]]
## [1] TRUE
## 
## [[14]]
## [1] TRUE
## 
## [[15]]
## [1] TRUE
## 
## [[16]]
## [1] TRUE
## 
## [[17]]
## [1] TRUE
## 
## [[18]]
## [1] TRUE
## 
## [[19]]
## [1] TRUE

Download the dataset.

Spam_SMS <- read.csv("./SMS_Spam_Dataset.csv", stringsAsFactors = F)
str(Spam_SMS)
## 'data.frame':    5572 obs. of  5 variables:
##  $ v1 : chr  "ham" "ham" "spam" "ham" ...
##  $ v2 : chr  "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
##  $ X  : chr  "" "" "" "" ...
##  $ X.1: chr  "" "" "" "" ...
##  $ X.2: chr  "" "" "" "" ...

Clean the data.

# Remove Null Columns.
Spam_SMS$X <- NULL
Spam_SMS$X.1 <- NULL
Spam_SMS$X.2 <- NULL

# Assign appropriate names to the columns.
names(Spam_SMS) <- c("MessageLabel","Message")

# Check if any other NULL values exist in the dataset.
colSums(is.na(Spam_SMS))
## MessageLabel      Message 
##            0            0
# Convert class into factor.
levels(as.factor(Spam_SMS$MessageLabel))
## [1] "ham"  "spam"
# Assign appropriate names to the data entries under Column "Message_Label"
Spam_SMS$MessageLabel[Spam_SMS$MessageLabel == "ham"] <- "Legitimate"
Spam_SMS$MessageLabel[Spam_SMS$MessageLabel == "spam"] <- "Spam"

# Convert class into factor.
Spam_SMS$MessageLabel <- factor(Spam_SMS$MessageLabel)

Explore the data

Explore the distribution of Spam and Legitimate Messages.

# Produce a data frame displaying the total number of legitmate messages and spam messages.
Distribution <- as.data.frame(table(Spam_SMS$MessageLabel))

# Calculate percentage for each type of Message Label. 
Distribution$Percentage <- (Distribution$Freq/nrow(Spam_SMS))*100
Distribution$Percentage <- round(Distribution$Percentage, digits = 2)
names(Distribution) <- c("Label", "Total", "Percentage")

# Plot the Distribution using plotly.
attach(Distribution)
## The following objects are masked from Distribution (pos = 3):
## 
##     Label, Percentage, Total
List <- list(
     zeroline=FALSE,
     showline=FALSE,
     showticklabels=FALSE,
     showgrid=FALSE
 )

plot_ly(Distribution, labels=Label, values = Percentage, type="pie", hole=0.2, showlegend = T) %>% layout(title = "Distribution of Spam Messages v/s Legitimate Messages", xaxis=List, yaxis=List, showlegend = TRUE)

This plot reveals that 86% of all the SMS messages in the dataset are Legitimate messages, while 13% of them are Spam messages.

To know the length of each text so as to be able to explore the data more.

# Count the number of characters in each Message.
Spam_SMS$MessageLength <- nchar(Spam_SMS$Message)

# Find the maximum length of Legitimate Message.
max(Spam_SMS$MessageLength[Spam_SMS$MessageLabel == "Legitimate"])
## [1] 910
# Find the maximum length of Spam Message.
max(Spam_SMS$MessageLength[Spam_SMS$MessageLabel == "Spam"])
## [1] 224
# Find the minimum length of Legitimate Message.
min(Spam_SMS$MessageLength[Spam_SMS$MessageLabel == "Legitimate"])
## [1] 2
# Find the minimum length of Spam Message.
min(Spam_SMS$MessageLength[Spam_SMS$MessageLabel == "Spam"])
## [1] 13

Plot the distribution of Legitimate and Spam messages v/s the Message Length.

ggplot(Spam_SMS, aes(x = MessageLength, fill = MessageLabel)) +
  theme_bw() +
  geom_histogram(binwidth = 5) +
  labs(y = "Number of Messages", x = "Length of Message",
       title = "Distribution of Message Lengths with Class Labels")

This plot helps us understand the following: 1. The length of legitimate messages ranges from 2 characters to 910 characters. 2. The length of spam messages ranges from 13 charcters to 224 characters. 3. The most common length of legitimate messages is 22 characters. 4. The most common length of spam messages is 158 characters.

Split Raw SMS Data on Labels (Spam and Legitmate) and produce wordclouds for each. Using Wordcloud would help understand frequent words. More frequent the word, larger the font will be for it. Producing wordclouds would give a better understanding of all the features that differentiate Spam SMSs from Legitimate SMSs.

# Splitting Raw SMS Data on Labels (Spam and Legitmate). 
Spam_Raw <- subset(Spam_SMS, MessageLabel == "Spam")
Legitimate_Raw <- subset(Spam_SMS, MessageLabel == "Legitimate")

# Produce wordcloud for Spam_Raw
pal = brewer.pal(6,"Dark2")
wordcloud(Spam_Raw$Message, max.words = 30, scale=c(6, .3), colors = pal)

The wordcloud reveals that the most frequent words in Spam messages are: Call, Free, Now, Mobile, Text and Prize.

# Produce wordcloud for Legitimate_Raw
wordcloud(Legitimate_Raw$Message, max.words = 30, scale=c(4, .3), colors = pal)

The wordcloud reveals that the most frequent words in legitimate messages are: Can, Will, Now, Just, etc.

To convert all the tokens to lower case. Post that, run for loops for words manually selected as differentiating features for Spam SMSs, and for words revealed frequent by the above wordcloud produced for spam messages. This would be followed by correct assignment of ‘y’ or ‘n’ for each message in the dataset. (‘y’ corresponds to availability of that word in a particular SMS while ‘n’ corresponds to non-availability of that word in the SMS)

# Transformation of all tokens to lower case.
Spam_SMS$Message %<>% str_to_lower()

# For loop for token 'free'
Spam_SMS$free <- "n"
for(i in 1:nrow(Spam_SMS)){
  if(str_detect(Spam_SMS$Message[i], "free")  == TRUE){
    Spam_SMS$free[i] <- "y"
  }
}

# For loop for token 'winner, win, won, award, selected, prize and claim'
Spam_SMS$winner <- "n"
for(i in 1:nrow(Spam_SMS)){
  if(str_detect(Spam_SMS$Message[i], "winner")  == TRUE){
    Spam_SMS$winner[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "win")  == TRUE){
   Spam_SMS$winner[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "won")  == TRUE){
   Spam_SMS$winner[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "award")  == TRUE){
   Spam_SMS$winner[i] <- "y"
    }
    if(str_detect(Spam_SMS$Message[i], "selected")  == TRUE){
   Spam_SMS$winner[i] <- "y"
    }
  if(str_detect(Spam_SMS$Message[i], "prize")  == TRUE){
   Spam_SMS$winner[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "claim")  == TRUE){
   Spam_SMS$winner[i] <- "y"
  }
}

# For loop for token 'congratulations, congrats'
Spam_SMS$congratulation <- "n"
for(i in 1:nrow(Spam_SMS)){
  if(str_detect(Spam_SMS$Message[i], "congrats")  == TRUE){
    Spam_SMS$congratulation[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "congratulations")  == TRUE){
    Spam_SMS$congratulation[i] <- "y"
  }
}

# For loop for token 'xxx, babe, naked, dirty, flirty'
Spam_SMS$adult <- "n"
for(i in 1:nrow(Spam_SMS)){
  if(str_detect(Spam_SMS$Message[i], "xxx")  == TRUE){
    Spam_SMS$adult[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "babe")  == TRUE){
    Spam_SMS$adult[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "naked")  == TRUE){
    Spam_SMS$adult[i] <- "y"
  }
    if(str_detect(Spam_SMS$Message[i], "dirty")  == TRUE){
    Spam_SMS$adult[i] <- "y"
    }
    if(str_detect(Spam_SMS$Message[i], "flirty")  == TRUE){
    Spam_SMS$adult[i] <- "y"
    }
}

# For loop for token 'urgent, attention, bonus, immediately, now, stop'
Spam_SMS$attention <- "n"
for(i in 1:nrow(Spam_SMS)){
  if(str_detect(Spam_SMS$Message[i], "urgent")  == TRUE){
    Spam_SMS$attention[i] <- "y"
  }
    if(str_detect(Spam_SMS$Message[i], "attention")  == TRUE){
    Spam_SMS$attention[i] <- "y"
    }
    if(str_detect(Spam_SMS$Message[i], "bonus")  == TRUE){
    Spam_SMS$attention[i] <- "y"
      }
    if(str_detect(Spam_SMS$Message[i], "immediately")  == TRUE){
    Spam_SMS$attention[i] <- "y"
    }
  if(str_detect(Spam_SMS$Message[i], "now")  == TRUE){
   Spam_SMS$attention[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "stop")  == TRUE){
   Spam_SMS$attention[i] <- "y"
  }
}

# For loop for token 'ringtone, call, mobile, text, txt'
Spam_SMS$ringtone  <- "n"
for(i in 1:nrow(Spam_SMS)){
  if(str_detect(Spam_SMS$Message[i], "ringtone")  == TRUE){
    Spam_SMS$ringtone[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "call")  == TRUE){
   Spam_SMS$ringtone[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "mobile")  == TRUE){
   Spam_SMS$ringtone[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "text")  == TRUE){
   Spam_SMS$ringtone[i] <- "y"
  }
  if(str_detect(Spam_SMS$Message[i], "txt")  == TRUE){
   Spam_SMS$ringtone[i] <- "y"
  }
}

After having this chunk run, there are 6 more columns added to the dataset (Spam_SMS) with values = y or n, depending on the availability of the keywords in messages.

Plot bar graph depicting total number of messages with the value of these features being equal to “y”.

#For Unigrams

# Produce a data frame 'Spam_Features' containing Features and the total number of messages containing that feature.
Spam_Features <- data.frame(Features = c("Free", "Adult", "Ringtone", "Congratulation", "Winner", "Attention"), Total = c(sum(Spam_SMS$free == "y"), sum(Spam_SMS$adult == "y"), sum(Spam_SMS$ringtone == "y"), sum(Spam_SMS$congratulation == "y"), sum(Spam_SMS$winner == "y"), sum(Spam_SMS$attention == "y")))

# Plot the data frame.
ggplot(Spam_Features, aes(x = reorder(Features, -Total), y = Total)) + geom_bar(stat = "identity", fill = "steelblue") + geom_text(aes(label = Total), color = "red", vjust = 0) + xlab("Features")+ ylab("Total Number of Messages")

The plot reveals that the most frequently used keywords fall under the categories: Ringtone, Attention and Winner, while the least frequently used keywords fall under the categories: Congratulations, Adult and Free.

Produce Venn Diagram to analyse how many SMS messages have bigrams’ feature combination and trigrams’ feature combinations.

For bigrams

# Compute the number of SMS messages having combination of two and/or three features. After having obtained these values, Venn Diagrams would be produced for these combinations.

#For Free and Adult
Free_Adult <- sum(Spam_SMS$free == "y" & Spam_SMS$adult == "y")
Free_Adult
## [1] 9
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 265, area2 = 150, cross.area = 9, category = c("Free", 
    "Adult"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.556], polygon[GRID.polygon.557], polygon[GRID.polygon.558], polygon[GRID.polygon.559], text[GRID.text.560], text[GRID.text.561], text[GRID.text.562], lines[GRID.lines.563], text[GRID.text.564], text[GRID.text.565])
#For Free and Ringtone
Free_Ringtone <- sum(Spam_SMS$free == "y" & Spam_SMS$ringtone == "y")
Free_Ringtone
## [1] 193
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 265, area2 = 994, cross.area = 193, category = c("Free", 
    "Ringtone"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.566], polygon[GRID.polygon.567], polygon[GRID.polygon.568], polygon[GRID.polygon.569], text[GRID.text.570], text[GRID.text.571], text[GRID.text.572], text[GRID.text.573], text[GRID.text.574])
#For Free and Congratulation
Free_Congratulation <- sum(Spam_SMS$free == "y" & Spam_SMS$congratulation == "y")
Free_Congratulation
## [1] 9
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 265, area2 = 34, cross.area = 9, category = c("Free", 
    "Congratulation"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.575], polygon[GRID.polygon.576], polygon[GRID.polygon.577], polygon[GRID.polygon.578], text[GRID.text.579], text[GRID.text.580], text[GRID.text.581], lines[GRID.lines.582], text[GRID.text.583], text[GRID.text.584])
#For Free and Winner
Free_Winner <- sum(Spam_SMS$free == "y" & Spam_SMS$winner == "y")
Free_Winner
## [1] 52
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 265, area2 = 419, cross.area = 52, category = c("Free", 
    "Winner"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.585], polygon[GRID.polygon.586], polygon[GRID.polygon.587], polygon[GRID.polygon.588], text[GRID.text.589], text[GRID.text.590], text[GRID.text.591], text[GRID.text.592], text[GRID.text.593])
#For Free and Attention
Free_Attention <- sum(Spam_SMS$free == "y" & Spam_SMS$attention == "y")
Free_Attention
## [1] 104
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 265, area2 = 928, cross.area = 104, category = c("Free", 
    "Attention"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.594], polygon[GRID.polygon.595], polygon[GRID.polygon.596], polygon[GRID.polygon.597], text[GRID.text.598], text[GRID.text.599], text[GRID.text.600], text[GRID.text.601], text[GRID.text.602])
#For Adult and Winner
Adult_Winner <- sum(Spam_SMS$adult == "y" & Spam_SMS$winner == "y")
Adult_Winner
## [1] 9
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 150, area2 = 419, cross.area = 9, category = c("Adult", 
    "Winner"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.603], polygon[GRID.polygon.604], polygon[GRID.polygon.605], polygon[GRID.polygon.606], text[GRID.text.607], text[GRID.text.608], text[GRID.text.609], lines[GRID.lines.610], text[GRID.text.611], text[GRID.text.612])
#For Adult and Attention
Adult_Attention <- sum(Spam_SMS$adult == "y" & Spam_SMS$attention == "y")
Adult_Attention
## [1] 29
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 150, area2 = 928, cross.area = 29, category = c("Adult", 
    "Attention"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.613], polygon[GRID.polygon.614], polygon[GRID.polygon.615], polygon[GRID.polygon.616], text[GRID.text.617], text[GRID.text.618], text[GRID.text.619], lines[GRID.lines.620], text[GRID.text.621], text[GRID.text.622])
#For Congratulation and Winner
congratulation_Winner <- sum(Spam_SMS$congratulation == "y" & Spam_SMS$winner == "y")
congratulation_Winner
## [1] 14
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 34, area2 = 419, cross.area = 14, category = c("Congratulation", 
    "Winner"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.623], polygon[GRID.polygon.624], polygon[GRID.polygon.625], polygon[GRID.polygon.626], text[GRID.text.627], text[GRID.text.628], lines[GRID.lines.629], text[GRID.text.630], lines[GRID.lines.631], text[GRID.text.632], text[GRID.text.633])
#For Attention and Winner
Attention_Winner <- sum(Spam_SMS$attention == "y" & Spam_SMS$winner == "y")
Attention_Winner
## [1] 161
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 928, area2 = 419, cross.area = 161, category = c("Attention", 
    "Winner"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.634], polygon[GRID.polygon.635], polygon[GRID.polygon.636], polygon[GRID.polygon.637], text[GRID.text.638], text[GRID.text.639], text[GRID.text.640], text[GRID.text.641], text[GRID.text.642])
#For Ringtone and Winner
Ringtone_Winner <- sum(Spam_SMS$ringtone == "y" & Spam_SMS$winner == "y")
Ringtone_Winner
## [1] 235
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 994, area2 = 419, cross.area = 235, category = c("Ringtone", 
    "Winner"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.643], polygon[GRID.polygon.644], polygon[GRID.polygon.645], polygon[GRID.polygon.646], text[GRID.text.647], text[GRID.text.648], text[GRID.text.649], text[GRID.text.650], text[GRID.text.651])
#For Ringtone and Congratulation
Ringtone_Congratulation <- sum(Spam_SMS$ringtone == "y" & Spam_SMS$congratulation == "y")
Ringtone_Congratulation
## [1] 23
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 994, area2 = 34, cross.area = 23, category = c("Ringtone", 
    "Congratulation"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.652], polygon[GRID.polygon.653], polygon[GRID.polygon.654], polygon[GRID.polygon.655], text[GRID.text.656], text[GRID.text.657], lines[GRID.lines.658], text[GRID.text.659], lines[GRID.lines.660], text[GRID.text.661], text[GRID.text.662])
#For Attention and Congratulation
Attention_Congratulation <- sum(Spam_SMS$attention == "y" & Spam_SMS$congratulation == "y")
Attention_Congratulation
## [1] 15
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 928, area2 = 34, cross.area = 15, category = c("Attention", 
    "Congratulation"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.663], polygon[GRID.polygon.664], polygon[GRID.polygon.665], polygon[GRID.polygon.666], text[GRID.text.667], text[GRID.text.668], lines[GRID.lines.669], text[GRID.text.670], lines[GRID.lines.671], text[GRID.text.672], text[GRID.text.673])
#For Attention and Ringtone
Attention_Ringtone <- sum(Spam_SMS$attention == "y" & Spam_SMS$ringtone == "y")
Attention_Ringtone
## [1] 368
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 928, area2 = 994, cross.area = 368, category = c("Attention", 
    "Ringtone"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.674], polygon[GRID.polygon.675], polygon[GRID.polygon.676], polygon[GRID.polygon.677], text[GRID.text.678], text[GRID.text.679], text[GRID.text.680], text[GRID.text.681], text[GRID.text.682])
#For Adult and Ringtone
Adult_Ringtone <- sum(Spam_SMS$adult == "y" & Spam_SMS$ringtone == "y")
Adult_Ringtone
## [1] 39
# Venn Diagram for the bigram
grid.newpage()
draw.pairwise.venn(area1 = 150, area2 = 994, cross.area = 39, category = c("Adult", 
    "Ringtone"), lty = rep("blank", 
    2), fill = c("light blue", "pink"), alpha = rep(0.5, 2), cat.pos = c(0, 
    0), cat.dist = rep(0.025, 2))

## (polygon[GRID.polygon.683], polygon[GRID.polygon.684], polygon[GRID.polygon.685], polygon[GRID.polygon.686], text[GRID.text.687], text[GRID.text.688], text[GRID.text.689], lines[GRID.lines.690], text[GRID.text.691], text[GRID.text.692])

For trigrams

#For free, congratulation and winner 
Free_Congratulation_Winner <- sum(Spam_SMS$free == "y" & Spam_SMS$congratulation == "y" & Spam_SMS$winner == "y")
Free_Congratulation_Winner
## [1] 6
# Venn Diagram for the trigram
grid.newpage()
draw.triple.venn(area1 = 265, area2 = 34, area3 = 419, n12 = 9, n23 = 14, n13 = 52, 
    n123 = 6, category = c("Free", "Congratulation", "Winner"), lty = "blank", 
    fill = c("skyblue", "pink1", "mediumorchid"))

## (polygon[GRID.polygon.693], polygon[GRID.polygon.694], polygon[GRID.polygon.695], polygon[GRID.polygon.696], polygon[GRID.polygon.697], polygon[GRID.polygon.698], text[GRID.text.699], text[GRID.text.700], text[GRID.text.701], text[GRID.text.702], text[GRID.text.703], text[GRID.text.704], text[GRID.text.705], text[GRID.text.706], text[GRID.text.707], text[GRID.text.708])
#For free, attention and winner 
Free_Attention_Winner <- sum(Spam_SMS$free == "y" & Spam_SMS$attention == "y" & Spam_SMS$winner == "y")
Free_Attention_Winner
## [1] 26
# Venn Diagram for the trigram
grid.newpage()
draw.triple.venn(area1 = 265, area2 = 928, area3 = 419, n12 = 104, n23 = 161, n13 = 52, 
    n123 = 2, category = c("Free", "Attention", "Winner"), lty = "blank", 
    fill = c("skyblue", "pink1", "mediumorchid"))

## (polygon[GRID.polygon.709], polygon[GRID.polygon.710], polygon[GRID.polygon.711], polygon[GRID.polygon.712], polygon[GRID.polygon.713], polygon[GRID.polygon.714], text[GRID.text.715], text[GRID.text.716], text[GRID.text.717], text[GRID.text.718], text[GRID.text.719], text[GRID.text.720], text[GRID.text.721], text[GRID.text.722], text[GRID.text.723], text[GRID.text.724])
#For adult, attention and winner 
Adult_Attention_Winner <- sum(Spam_SMS$adult == "y" & Spam_SMS$attention == "y" & Spam_SMS$winner == "y")
Adult_Attention_Winner
## [1] 3
# Venn Diagram for the trigram
grid.newpage()
draw.triple.venn(area1 = 150, area2 = 928, area3 = 419, n12 = 29, n23 = 161, n13 = 9, 
    n123 = 3, category = c("Adult", "Attention", "Winner"), lty = "blank", 
    fill = c("skyblue", "pink1", "mediumorchid"))

## (polygon[GRID.polygon.725], polygon[GRID.polygon.726], polygon[GRID.polygon.727], polygon[GRID.polygon.728], polygon[GRID.polygon.729], polygon[GRID.polygon.730], text[GRID.text.731], text[GRID.text.732], text[GRID.text.733], text[GRID.text.734], text[GRID.text.735], text[GRID.text.736], text[GRID.text.737], text[GRID.text.738], text[GRID.text.739], text[GRID.text.740])

Text Analysis

To make the data ready for text analysis. In this, we use text-mining package (package tm) to manage the documents.

# create a Corpus of Messages in Spam_SMS. 
BagOfWords <- Corpus(VectorSource(Spam_SMS$Message))

# Clean corpus.
Clean_BagOfWords <- BagOfWords %>%
                    tm_map(content_transformer(tolower)) %>% # Transofrm to lower case
                    tm_map(removeNumbers) %>%                # Clean by removing numbers
                    tm_map(removeWords, stopwords(kind="en")) %>% # Clean by removing stopwords
                    tm_map(removePunctuation) %>%            # Clean by removing punctuation
                    tm_map(stripWhitespace)                  # Clean by tokenising by striping white space

# Transform corpus into matrix.
TDM = DocumentTermMatrix(Clean_BagOfWords)

SparseWords <- removeSparseTerms(TDM, 0.995)

# Transform the matrix of Sparsewords into data frame.
SparseWords <- as.data.frame(as.matrix(SparseWords))

# Rename column names.
colnames(SparseWords) <- make.names(colnames(SparseWords))

str(SparseWords)
## 'data.frame':    5572 obs. of  290 variables:
##  $ got       : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ great     : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ wat       : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ world     : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ lar       : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ apply     : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ free      : num  0 0 1 0 0 0 0 0 0 2 ...
##  $ may       : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ receive   : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ text      : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ txt       : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ win       : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ already   : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ dun       : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ early     : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ say       : num  0 0 0 2 0 0 0 0 0 0 ...
##  $ around    : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ think     : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ back      : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ fun       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ hey       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ like      : num  0 0 0 0 0 1 2 0 0 0 ...
##  $ now       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ send      : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ still     : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ word      : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ xxx       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ even      : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ speak     : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ friends   : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ per       : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ call      : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ claim     : num  0 0 0 0 0 0 0 0 2 0 ...
##  $ code      : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ customer  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ network   : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ prize     : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ selected  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ camera    : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ latest    : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ mobile    : num  0 0 0 0 0 0 0 0 0 2 ...
##  $ enough    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ gonna     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ home      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ soon      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ stuff     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ talk      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ today     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ tonight   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ want      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ cash      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ cost      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ days      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ reply     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pobox     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ urgent    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ week      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ won       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ help      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ right     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ take      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ thank     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ will      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ wont      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ message   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ next.     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ use       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ watching  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ make      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ name      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ remember  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ yes       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ feel      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fine      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ way       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dont      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ miss      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ going     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ try       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ first     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ finish    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lor       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lunch     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ can       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ meet      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ eat       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ getting   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ just      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lol       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ really    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ always    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dinner    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ left      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ love      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amp       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ car       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ know      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ let       : num  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
SparseWords$MessageLabel <- Spam_SMS$MessageLabel

Classification Process to accurately classify SMS messages into Spam messages or Legitimate messages.

Splitting the data in a ratio of 7:3: 70% to build the predictive model and 30% to test the model. I am splitting the dataset, Sparsewords, Corpus(BagOfWords) and the Term Document Matrix.

# Random number generation using set.seed of 1234.
set.seed(1234)

# Create a split formula using which I would split the data into train and test sets.
Split_Formula <- createDataPartition(Spam_SMS$MessageLabel, p=0.7, list=FALSE)

# Split Spam_SMS into training and test sets.
train_data <- Spam_SMS[Split_Formula,]
test_data <- Spam_SMS[-Split_Formula,]

# Split SparseWords into training and test sets.
Sparse_train_data <- SparseWords[Split_Formula,]
Sparse_test_data <- SparseWords[-Split_Formula,]

# Split corpus into training and test data.
Corpus_train_data <- Clean_BagOfWords[Split_Formula]
Corpus_test_data <- Clean_BagOfWords[-Split_Formula]

# Split Term Document Matrix into training and test data.
TDM_train_data <- TDM[Split_Formula,]
TDM_test_data <- TDM[-Split_Formula,]

Producing Wordcloud of the cleaned Corpus for analysis.

wordcloud(Clean_BagOfWords, max.words = 75, random.order = FALSE, scale=c(5, .3), colors = pal)

The wordcloud reveals that the most frequent words in Clean Corpus(mix of Legitimate and Spam messages) are: Call, Can, Now, Get, Just, Will, Free, etc. Therefore, it is evident that this wordcloud substantiates the two wordclouds produced above (each for spam an legitimate messages) as this wordcloud has a mix of the frequent words shown in those wordclouds (like: Free, Call, Can, Just)

Split train_data on Labels (Spam and Legitmate) and produce wordclouds for each. Using Wordcloud would help understand frequent words. More frequent the word, larger the font will be for it.

# Splitting train_data on Labels (Spam and Legitmate).
Spam <- subset(train_data, MessageLabel == "Spam")
Legitimate <- subset(train_data, MessageLabel == "Legitimate")

# Produce wordcloud for Spam
wordcloud(Spam$Message, max.words = 30, scale=c(7, .3), colors = pal)

The wordcloud reveals that the most frequent words in Spam messages for train data are: Call, Free, Now, Claim. Text, etc. They are the same as the ones displayed in the wordcloud for Spam messages in Spam_SMS dataset. Hence, this shows that the data has been correctly splitted into trainng and test sets.

# Produce wordcloud for Legitimate.
wordcloud(Legitimate$Message, max.words = 30, scale=c(5, .3), colors = pal)

The wordcloud reveals that the most frequent words in Legitimate messages for train data are: Will, Can, Now, Just, etc. they are the same as the ones displayed in the wordcloud for Legitimate messages in Spam_SMS dataset. Hence, this shows that the data has been correctly splitted into trainng and test sets.

Building models based on the manually selected 6 features of Spam SMS.

Decision Tree Model

# Build a recursive partitioning decision tree.

SMS_Rpart <- rpart(formula = MessageLabel ~ free + winner + congratulation + adult + attention + ringtone, data = train_data, method = "class")

rpart.plot(SMS_Rpart, type = 4, fallen.leaves = FALSE, extra = 4)

This tree reveals that out of all these tokens, the most important token is ‘ringtone’ and the least important ones being ‘congratulation and adult’.

summary(SMS_Rpart)
## Call:
## rpart(formula = MessageLabel ~ free + winner + congratulation + 
##     adult + attention + ringtone, data = train_data, method = "class")
##   n= 3901 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.28871893      0 1.0000000 1.0000000 0.04069031
## 2 0.08221797      1 0.7112811 0.7112811 0.03507580
## 3 0.06883365      3 0.5468451 0.6558317 0.03381897
## 4 0.01000000      4 0.4780115 0.4780115 0.02924733
## 
## Variable importance
##       ringtone           free         winner      attention congratulation 
##             68             13             11              6              1 
## 
## Node number 1: 3901 observations,    complexity param=0.2887189
##   predicted class=Legitimate  expected loss=0.1340682  P(node) =1
##     class counts:  3378   523
##    probabilities: 0.866 0.134 
##   left son=2 (3204 obs) right son=3 (697 obs)
##   Primary splits:
##       ringtone       splits as  LR, improve=381.73920, (0 missing)
##       winner         splits as  LR, improve=152.27790, (0 missing)
##       free           splits as  LR, improve=135.13340, (0 missing)
##       attention      splits as  LR, improve=102.71100, (0 missing)
##       congratulation splits as  LR, improve= 14.21593, (0 missing)
##   Surrogate splits:
##       free           splits as  LR, agree=0.843, adj=0.123, (0 split)
##       winner         splits as  LR, agree=0.830, adj=0.049, (0 split)
##       congratulation splits as  LR, agree=0.823, adj=0.010, (0 split)
## 
## Node number 2: 3204 observations
##   predicted class=Legitimate  expected loss=0.03089888  P(node) =0.8213279
##     class counts:  3105    99
##    probabilities: 0.969 0.031 
## 
## Node number 3: 697 observations,    complexity param=0.08221797
##   predicted class=Spam        expected loss=0.3916786  P(node) =0.1786721
##     class counts:   273   424
##    probabilities: 0.392 0.608 
##   left son=6 (532 obs) right son=7 (165 obs)
##   Primary splits:
##       winner         splits as  LR, improve=43.9829100, (0 missing)
##       attention      splits as  LR, improve=37.9821300, (0 missing)
##       free           splits as  LR, improve=25.1773400, (0 missing)
##       congratulation splits as  LR, improve= 4.3835890, (0 missing)
##       adult          splits as  LR, improve= 0.8117561, (0 missing)
##   Surrogate splits:
##       congratulation splits as  LR, agree=0.766, adj=0.012, (0 split)
## 
## Node number 6: 532 observations,    complexity param=0.08221797
##   predicted class=Spam        expected loss=0.4906015  P(node) =0.1363753
##     class counts:   261   271
##    probabilities: 0.491 0.509 
##   left son=12 (352 obs) right son=13 (180 obs)
##   Primary splits:
##       attention splits as  LR, improve=36.011700, (0 missing)
##       free      splits as  LR, improve=35.200000, (0 missing)
##       adult     splits as  LR, improve= 1.243285, (0 missing)
##   Surrogate splits:
##       congratulation splits as  LR, agree=0.665, adj=0.011, (0 split)
## 
## Node number 7: 165 observations
##   predicted class=Spam        expected loss=0.07272727  P(node) =0.04229685
##     class counts:    12   153
##    probabilities: 0.073 0.927 
## 
## Node number 12: 352 observations,    complexity param=0.06883365
##   predicted class=Legitimate  expected loss=0.3778409  P(node) =0.09023327
##     class counts:   219   133
##    probabilities: 0.622 0.378 
##   left son=24 (294 obs) right son=25 (58 obs)
##   Primary splits:
##       free  splits as  LR, improve=25.979660, (0 missing)
##       adult splits as  LR, improve= 2.669721, (0 missing)
## 
## Node number 13: 180 observations
##   predicted class=Spam        expected loss=0.2333333  P(node) =0.04614201
##     class counts:    42   138
##    probabilities: 0.233 0.767 
## 
## Node number 24: 294 observations
##   predicted class=Legitimate  expected loss=0.292517  P(node) =0.07536529
##     class counts:   208    86
##    probabilities: 0.707 0.293 
## 
## Node number 25: 58 observations
##   predicted class=Spam        expected loss=0.1896552  P(node) =0.01486798
##     class counts:    11    47
##    probabilities: 0.190 0.810

Randome Forest Classifier

Apply Random Forest to substantiate analysis of Decision Tree by plotting the importance of each token.

train_data$MessageLabel %<>% as.factor()
train_data$Message  %<>% as.character()
train_data$free %<>% as.factor()
train_data$winner %<>% as.factor()
train_data$congratulation %<>% as.factor()
train_data$adult  %<>% as.factor()
train_data$attention   %<>% as.factor()
train_data$ringtone %<>% as.factor()

# Apply the formula for Random Forest Algorithm
SMS_RF <- MessageLabel ~ free + winner + congratulation + adult + attention + ringtone
RFSpam_Tree <- randomForest(SMS_RF, data = train_data, ntree=25, proximity = T)

# Plot the Variable Importance Plot.
ImportancePlot <- varImpPlot(RFSpam_Tree, main = "Importance of each Token") 

This plot salso expresses that the most important token amongst all is ‘Ringtone’, and the least important are ‘adult and congratulation’.

# Importance of each token in a tabular form.
importance(RFSpam_Tree)
##                MeanDecreaseGini
## free                  59.187556
## winner                90.519881
## congratulation         4.755318
## adult                  3.112153
## attention             42.090636
## ringtone             226.477607

Test the above Random Forest Model on test data and check the accuracy, precision, recall and F1.

test_data$MessageLabel %<>% as.factor()
test_data$Message  %<>% as.character()
test_data$free %<>% as.factor()
test_data$winner %<>% as.factor()
test_data$congratulation %<>% as.factor()
test_data$adult  %<>% as.factor()
test_data$attention   %<>% as.factor()
test_data$ringtone %<>% as.factor()

RFTest <- predict(RFSpam_Tree, newdata =test_data)

# Confusion Matrix
RF_Matrix <- confusionMatrix(predict(RFSpam_Tree, newdata =test_data), test_data$MessageLabel)
RF_Matrix
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Legitimate Spam
##   Legitimate       1419   87
##   Spam               28  137
##                                          
##                Accuracy : 0.9312         
##                  95% CI : (0.918, 0.9428)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6664         
##  Mcnemar's Test P-Value : 6.354e-08      
##                                          
##             Sensitivity : 0.9806         
##             Specificity : 0.6116         
##          Pos Pred Value : 0.9422         
##          Neg Pred Value : 0.8303         
##              Prevalence : 0.8659         
##          Detection Rate : 0.8492         
##    Detection Prevalence : 0.9013         
##       Balanced Accuracy : 0.7961         
##                                          
##        'Positive' Class : Legitimate     
## 
# CrossTable
CrossTable(RFTest, test_data$MessageLabel, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | test_data$MessageLabel 
##       RFTest | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1419 |         87 |       1506 | 
##              |      0.942 |      0.058 |      0.901 | 
##              |      0.981 |      0.388 |            | 
##              |      0.849 |      0.052 |            | 
## -------------|------------|------------|------------|
##         Spam |         28 |        137 |        165 | 
##              |      0.170 |      0.830 |      0.099 | 
##              |      0.019 |      0.612 |            | 
##              |      0.017 |      0.082 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.94, while for predicting spam messages is 0.83. 2. Recall for predicting Legitimate messages is 0.98, while for predicting spam messages is 0.61. Therefore, we can deduce that the probability of predicting a Legitmate message as a Spam is moderately high (0.17) as compared to the probability of a spam message being predicted as a legitimate message (0.06).

Accuracy for test data.

TestPredictability <- sum(RFTest == test_data$MessageLabel)/ length(test_data$MessageLabel)*100

message("Accuracy for Test Data is:")
## Accuracy for Test Data is:
print(TestPredictability)
## [1] 93.11789

Plot COnfusion Matrix

Reference_RF <- factor(c("Legitimate", "Legitimate", "Spam", "Spam"))
Prediction_RF <- factor(c("Legitimate", "Spam","Legitimate","Spam"))
Y <- c(1419, 28, 87, 137)
ConfusionMatrixPlot_RF <- data.frame(Reference_RF, Prediction_RF, Y)

# Plot
ggplot(data =  ConfusionMatrixPlot_RF, mapping = aes(x = Reference_RF, y = Prediction_RF)) +
     geom_tile(aes(fill = Y), colour = "white") +
     geom_text(aes(label = sprintf("%1.0f", Y)), vjust = 1) +
     scale_fill_gradient(low = "yellow", high = "dark green") +
     theme_bw() + theme(legend.position = "none")

Support Vector Machine

SMS_SVM <- svm(MessageLabel ~ free + winner + congratulation + adult + attention + ringtone, data = train_data, kernel = "linear", cost = 0.1, gamma = 0.1)
SVMTest <- predict(SMS_SVM, test_data)

# Confusion Matrix
SVM_Matrix <- confusionMatrix(predict(SMS_SVM, newdata = test_data), test_data$MessageLabel)
SVM_Matrix
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Legitimate Spam
##   Legitimate       1413   79
##   Spam               34  145
##                                           
##                Accuracy : 0.9324          
##                  95% CI : (0.9193, 0.9439)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6817          
##  Mcnemar's Test P-Value : 3.486e-05       
##                                           
##             Sensitivity : 0.9765          
##             Specificity : 0.6473          
##          Pos Pred Value : 0.9471          
##          Neg Pred Value : 0.8101          
##              Prevalence : 0.8659          
##          Detection Rate : 0.8456          
##    Detection Prevalence : 0.8929          
##       Balanced Accuracy : 0.8119          
##                                           
##        'Positive' Class : Legitimate      
## 
# CrossTable
CrossTable(SVMTest, test_data$MessageLabel, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | test_data$MessageLabel 
##      SVMTest | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1413 |         79 |       1492 | 
##              |      0.947 |      0.053 |      0.893 | 
##              |      0.977 |      0.353 |            | 
##              |      0.846 |      0.047 |            | 
## -------------|------------|------------|------------|
##         Spam |         34 |        145 |        179 | 
##              |      0.190 |      0.810 |      0.107 | 
##              |      0.023 |      0.647 |            | 
##              |      0.020 |      0.087 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.95, while for predicting spam messages is 0.8. 2. Recall for predicting Legitimate messages is 0.98, while for predicting spam messages is 0.65. Therefore, we can deduce that the probability of predicting a Legitmate message as a Spam is moderately high (0.19) as compared to the probability of a spam message being predicted as a legitimate message (0.05).

Accuracy for test data.

svm.accuracy.table <- as.data.frame(table(test_data$MessageLabel, SVMTest))
print(paste("Accuracy for SVM is:",
            100*round(((svm.accuracy.table$Freq[1]+svm.accuracy.table$Freq[4])/nrow(test_data)), 4),
            "%"))
## [1] "Accuracy for SVM is: 93.24 %"

Plot confusion matrix.

Reference_SVM <- factor(c("Legitimate", "Legitimate", "Spam", "Spam"))
Prediction_SVM <- factor(c("Legitimate", "Spam","Legitimate","Spam"))
Y_SVM <- c(1413, 34, 79, 145)
ConfusionMatrixPlot_SVM <- data.frame(Reference_SVM, Prediction_SVM, Y_SVM)

# Plot
ggplot(data =  ConfusionMatrixPlot_SVM, mapping = aes(x = Reference_SVM, y = Prediction_SVM)) +
     geom_tile(aes(fill = Y_SVM), colour = "white") +
     geom_text(aes(label = sprintf("%1.0f", Y_SVM)), vjust = 1) +
     scale_fill_gradient(low = "yellow", high = "dark green") +
     theme_bw() + theme(legend.position = "none")

Logistic regression

SMS_GLM <- glm(MessageLabel ~ free + winner + congratulation + adult + attention + ringtone, data = train_data, family = "binomial")
GLMTest <- predict(SMS_GLM, test_data, type = 'response')

#Confusion Matrix
GLM_Matrix <- table(test_data$MessageLabel, GLMTest > 0.5)
GLM_Matrix
##             
##              FALSE TRUE
##   Legitimate  1415   32
##   Spam          82  142
summary(SMS_GLM)
## 
## Call:
## glm(formula = MessageLabel ~ free + winner + congratulation + 
##     adult + attention + ringtone, family = "binomial", data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2869  -0.1661  -0.1661  -0.1661   2.9294  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -4.2768     0.1354 -31.583  < 2e-16 ***
## freey             2.2517     0.2502   9.001  < 2e-16 ***
## winnery           2.5418     0.1995  12.739  < 2e-16 ***
## congratulationy   2.2650     0.9018   2.512  0.01202 *  
## adulty            1.1240     0.3447   3.261  0.00111 ** 
## attentiony        1.4571     0.1540   9.461  < 2e-16 ***
## ringtoney         3.4235     0.1476  23.188  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3074.4  on 3900  degrees of freedom
## Residual deviance: 1402.8  on 3894  degrees of freedom
## AIC: 1416.8
## 
## Number of Fisher Scoring iterations: 6

Analysing the summary for Logistic Regression train model, we can infer that: 1. Distribution of residuals is symmetrical. That is, that model can accurately predict points that are close to the actual observed points. 2. The model reveals that ‘congratulation’ and ‘adult’ are the most least important terms as their value of error is far greater than the value of error for Intercept.

Accuracy for test data.

#table(test_data$Label, Logistic_Regression_Test > 0.75)
glm.accuracy.table <- as.data.frame(table(test_data$MessageLabel, GLMTest > 0.75))
print(paste("Accuracy of Logistic Regression is:",
            100*round(((glm.accuracy.table$Freq[1]+glm.accuracy.table$Freq[4])/nrow(test_data)), 4),
            "%"))
## [1] "Accuracy of Logistic Regression is: 92.94 %"

ROCR Curve

library(ROCR)
Logistic_Regression_Prediction <- prediction(abs(GLMTest), test_data$MessageLabel)
Logistic_Regression_Performance <- performance(Logistic_Regression_Prediction,"tpr","fpr")
plot(Logistic_Regression_Performance, colorize = TRUE, text.adj = c(-0.2,1.7))

The ROCR curve substantiates the high accuracy of the model as the closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test.

Naive Bayes Classifier

#Retain words which appear in 5 or more than 5 SMS messages.
Frequent_Terms = findFreqTerms(TDM_train_data, 5)
TDM_train_data_New = DocumentTermMatrix(Corpus_train_data, list(dictionary=Frequent_Terms))
TDM_test_data_New =  DocumentTermMatrix(Corpus_test_data, list(dictionary=Frequent_Terms))
#To write a function to convert numerics in TDms to factors of yes/no.
Convert_Numerics_To_Factors = function(num) 
  {
  num = ifelse(num > 0, 1, 0)
  num = factor(num, levels = c(0, 1), labels=c("No", "Yes"))
  return (num)
  }

#Apply above fucntion to the new TDM train and test datasets.
TDM_train_data_New = apply(TDM_train_data_New, MARGIN=2, Convert_Numerics_To_Factors)
TDM_test_data_New  = apply(TDM_test_data_New, MARGIN=2, Convert_Numerics_To_Factors)
SMS_NB = naiveBayes(MessageLabel ~ free + winner + congratulation + adult + attention + ringtone, data = train_data, laplace = 1)
SMS_NBTest = predict(SMS_NB, TDM_test_data_New)


library(gmodels)
CT <- CrossTable(SMS_NBTest, test_data$MessageLabel, 
           prop.chisq = FALSE, 
           dnn = c("Predicted", "Actual")) #Name of column
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | Actual 
##    Predicted | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1447 |        221 |       1668 | 
##              |      0.868 |      0.132 |      0.998 | 
##              |      1.000 |      0.987 |            | 
##              |      0.866 |      0.132 |            | 
## -------------|------------|------------|------------|
##         Spam |          0 |          3 |          3 | 
##              |      0.000 |      1.000 |      0.002 | 
##              |      0.000 |      0.013 |            | 
##              |      0.000 |      0.002 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.87, while for predicting spam messages is 1.00. 2. Recall for predicting Legitimate messages is 1.00, while for predicting spam messages is 0.013. Therefore, we can deduce that the probability of predicting a Legitmate message as a Spam is perfect (0.00) as compared to the probability of a spam message being predicted as a legitimate message (0.13).

nb.accuracy.table <- as.data.frame(table(test_data$MessageLabel, SMS_NBTest))
print(paste("Accuracy for NB is:",
             100*round(((nb.accuracy.table$Freq[1]+nb.accuracy.table$Freq[4])/nrow(test_data)), 4),
             "%"))
## [1] "Accuracy for NB is: 86.77 %"

Building models for all the features of Spam SMS.

Decision Tree Model

# Build a recursive partitioning decision tree.
SMS_Rpart_All <- rpart(formula = MessageLabel ~., data = Sparse_train_data, method = "class")

rpart.plot(SMS_Rpart_All, type = 4, fallen.leaves = FALSE, extra = 4)

This tree reveals that out of all these tokens, the most important token is ‘call’ and the least important ones being ‘mobile and stop’.

summary(SMS_Rpart_All)
## Call:
## rpart(formula = MessageLabel ~ ., data = Sparse_train_data, method = "class")
##   n= 3901 
## 
##            CP nsplit rel error    xerror       xstd
## 1  0.15487572      0 1.0000000 1.0000000 0.04069031
## 2  0.15296367      1 0.8451243 0.9158700 0.03919387
## 3  0.06883365      2 0.6921606 0.6921606 0.03465013
## 4  0.01912046      3 0.6233270 0.6252390 0.03309493
## 5  0.01816444      4 0.6042065 0.6386233 0.03341451
## 6  0.01720841      6 0.5678776 0.6290631 0.03318670
## 7  0.01529637      8 0.5334608 0.6118547 0.03277083
## 8  0.01434034      9 0.5181644 0.5736138 0.03181871
## 9  0.01338432     11 0.4894837 0.5621415 0.03152514
## 10 0.01000000     12 0.4760994 0.5143403 0.03025934
## 
## Variable importance
##       call        txt      claim       text      later      reply 
##         26         19          8          7          4          4 
##      prize      sorry       stop     urgent        ppm        won 
##          3          2          2          2          2          2 
##       draw        can       free     mobile    awarded      nokia 
##          2          2          1          1          1          1 
##       tone        yes      pobox guaranteed       send 
##          1          1          1          1          1 
## 
## Node number 1: 3901 observations,    complexity param=0.1548757
##   predicted class=Legitimate  expected loss=0.1340682  P(node) =1
##     class counts:  3378   523
##    probabilities: 0.866 0.134 
##   left son=2 (3524 obs) right son=3 (377 obs)
##   Primary splits:
##       call   < 0.5 to the left,  improve=187.02190, (0 missing)
##       txt    < 0.5 to the left,  improve=129.74050, (0 missing)
##       claim  < 0.5 to the left,  improve=127.17900, (0 missing)
##       free   < 0.5 to the left,  improve=114.55070, (0 missing)
##       mobile < 0.5 to the left,  improve= 95.84149, (0 missing)
##   Surrogate splits:
##       prize  < 0.5 to the left,  agree=0.914, adj=0.106, (0 split)
##       claim  < 0.5 to the left,  agree=0.912, adj=0.088, (0 split)
##       urgent < 0.5 to the left,  agree=0.912, adj=0.085, (0 split)
##       won    < 0.5 to the left,  agree=0.911, adj=0.082, (0 split)
##       ppm    < 0.5 to the left,  agree=0.911, adj=0.082, (0 split)
## 
## Node number 2: 3524 observations,    complexity param=0.1529637
##   predicted class=Legitimate  expected loss=0.08342792  P(node) =0.9033581
##     class counts:  3230   294
##    probabilities: 0.917 0.083 
##   left son=4 (3426 obs) right son=5 (98 obs)
##   Primary splits:
##       txt  < 0.5 to the left,  improve=137.13040, (0 missing)
##       free < 0.5 to the left,  improve= 80.41720, (0 missing)
##       stop < 0.5 to the left,  improve= 59.43363, (0 missing)
##       win  < 0.5 to the left,  improve= 55.76072, (0 missing)
##       text < 0.5 to the left,  improve= 51.42719, (0 missing)
##   Surrogate splits:
##       draw    < 0.5 to the left,  agree=0.974, adj=0.082, (0 split)
##       nokia   < 1.5 to the left,  agree=0.973, adj=0.041, (0 split)
##       awarded < 0.5 to the left,  agree=0.973, adj=0.041, (0 split)
##       tone    < 0.5 to the left,  agree=0.973, adj=0.041, (0 split)
##       tcs     < 0.5 to the left,  agree=0.973, adj=0.020, (0 split)
## 
## Node number 3: 377 observations,    complexity param=0.06883365
##   predicted class=Spam        expected loss=0.3925729  P(node) =0.09664189
##     class counts:   148   229
##    probabilities: 0.393 0.607 
##   left son=6 (36 obs) right son=7 (341 obs)
##   Primary splits:
##       later  < 0.5 to the right, improve=29.37026, (0 missing)
##       sorry  < 0.5 to the right, improve=24.09157, (0 missing)
##       claim  < 0.5 to the left,  improve=21.12756, (0 missing)
##       prize  < 0.5 to the left,  improve=17.35938, (0 missing)
##       urgent < 0.5 to the left,  improve=14.56856, (0 missing)
##   Surrogate splits:
##       sorry   < 0.5 to the right, agree=0.960, adj=0.583, (0 split)
##       meeting < 0.5 to the right, agree=0.915, adj=0.111, (0 split)
## 
## Node number 4: 3426 observations,    complexity param=0.01912046
##   predicted class=Legitimate  expected loss=0.05983654  P(node) =0.8782363
##     class counts:  3221   205
##    probabilities: 0.940 0.060 
##   left son=8 (3334 obs) right son=9 (92 obs)
##   Primary splits:
##       text  < 0.5 to the left,  improve=46.23725, (0 missing)
##       free  < 0.5 to the left,  improve=41.09870, (0 missing)
##       reply < 0.5 to the left,  improve=37.11689, (0 missing)
##       stop  < 0.5 to the left,  improve=35.03827, (0 missing)
##       claim < 0.5 to the left,  improve=31.98873, (0 missing)
##   Surrogate splits:
##       free    < 2.5 to the left,  agree=0.974, adj=0.043, (0 split)
##       pobox   < 0.5 to the left,  agree=0.974, adj=0.033, (0 split)
##       message < 1.5 to the left,  agree=0.974, adj=0.022, (0 split)
##       video   < 0.5 to the left,  agree=0.973, adj=0.011, (0 split)
## 
## Node number 5: 98 observations
##   predicted class=Spam        expected loss=0.09183673  P(node) =0.02512176
##     class counts:     9    89
##    probabilities: 0.092 0.908 
## 
## Node number 6: 36 observations
##   predicted class=Legitimate  expected loss=0  P(node) =0.009228403
##     class counts:    36     0
##    probabilities: 1.000 0.000 
## 
## Node number 7: 341 observations,    complexity param=0.01720841
##   predicted class=Spam        expected loss=0.3284457  P(node) =0.08741348
##     class counts:   112   229
##    probabilities: 0.328 0.672 
##   left son=14 (283 obs) right son=15 (58 obs)
##   Primary splits:
##       claim  < 0.5 to the left,  improve=15.07833, (0 missing)
##       can    < 0.5 to the right, improve=13.87236, (0 missing)
##       prize  < 0.5 to the left,  improve=12.34596, (0 missing)
##       urgent < 0.5 to the left,  improve=10.33451, (0 missing)
##       won    < 0.5 to the left,  improve= 9.50100, (0 missing)
##   Surrogate splits:
##       guaranteed < 0.5 to the left,  agree=0.871, adj=0.241, (0 split)
##       prize      < 0.5 to the left,  agree=0.862, adj=0.190, (0 split)
##       draw       < 0.5 to the left,  agree=0.859, adj=0.172, (0 split)
##       hrs        < 0.5 to the left,  agree=0.859, adj=0.172, (0 split)
##       selected   < 0.5 to the left,  agree=0.856, adj=0.155, (0 split)
## 
## Node number 8: 3334 observations,    complexity param=0.01816444
##   predicted class=Legitimate  expected loss=0.04619076  P(node) =0.8546527
##     class counts:  3180   154
##    probabilities: 0.954 0.046 
##   left son=16 (3284 obs) right son=17 (50 obs)
##   Primary splits:
##       reply < 0.5 to the left,  improve=28.92908, (0 missing)
##       claim < 0.5 to the left,  improve=20.08080, (0 missing)
##       stop  < 0.5 to the left,  improve=19.10557, (0 missing)
##       free  < 0.5 to the left,  improve=18.22569, (0 missing)
##       win   < 0.5 to the left,  improve=16.54198, (0 missing)
##   Surrogate splits:
##       stop < 1.5 to the left,  agree=0.987, adj=0.12, (0 split)
##       end  < 1.5 to the left,  agree=0.986, adj=0.04, (0 split)
##       went < 2.5 to the left,  agree=0.986, adj=0.04, (0 split)
## 
## Node number 9: 92 observations,    complexity param=0.01434034
##   predicted class=Spam        expected loss=0.4456522  P(node) =0.0235837
##     class counts:    41    51
##    probabilities: 0.446 0.554 
##   left son=18 (70 obs) right son=19 (22 obs)
##   Primary splits:
##       free   < 0.5 to the left,  improve=5.531846, (0 missing)
##       mobile < 0.5 to the left,  improve=5.481522, (0 missing)
##       stop   < 0.5 to the left,  improve=4.625020, (0 missing)
##       text   < 1.5 to the left,  improve=4.456522, (0 missing)
##       yes    < 0.5 to the left,  improve=3.480331, (0 missing)
##   Surrogate splits:
##       fun     < 0.5 to the left,  agree=0.793, adj=0.136, (0 split)
##       word    < 0.5 to the left,  agree=0.793, adj=0.136, (0 split)
##       latest  < 0.5 to the left,  agree=0.793, adj=0.136, (0 split)
##       orange  < 0.5 to the left,  agree=0.793, adj=0.136, (0 split)
##       message < 1.5 to the left,  agree=0.783, adj=0.091, (0 split)
## 
## Node number 14: 283 observations,    complexity param=0.01720841
##   predicted class=Spam        expected loss=0.3957597  P(node) =0.0725455
##     class counts:   112   171
##    probabilities: 0.396 0.604 
##   left son=28 (26 obs) right son=29 (257 obs)
##   Primary splits:
##       can    < 0.5 to the right, improve=11.615610, (0 missing)
##       mobile < 0.5 to the left,  improve=10.372500, (0 missing)
##       urgent < 0.5 to the left,  improve= 8.968500, (0 missing)
##       ppm    < 0.5 to the left,  improve= 7.842131, (0 missing)
##       mins   < 0.5 to the left,  improve= 7.472429, (0 missing)
##   Surrogate splits:
##       dont < 0.5 to the right, agree=0.919, adj=0.115, (0 split)
##       come < 1.5 to the right, agree=0.919, adj=0.115, (0 split)
##       back < 0.5 to the right, agree=0.915, adj=0.077, (0 split)
##       help < 1.5 to the right, agree=0.915, adj=0.077, (0 split)
##       sure < 0.5 to the right, agree=0.915, adj=0.077, (0 split)
## 
## Node number 15: 58 observations
##   predicted class=Spam        expected loss=0  P(node) =0.01486798
##     class counts:     0    58
##    probabilities: 0.000 1.000 
## 
## Node number 16: 3284 observations,    complexity param=0.01816444
##   predicted class=Legitimate  expected loss=0.03806334  P(node) =0.8418354
##     class counts:  3159   125
##    probabilities: 0.962 0.038 
##   left son=32 (3273 obs) right son=33 (11 obs)
##   Primary splits:
##       claim  < 0.5 to the left,  improve=20.425500, (0 missing)
##       free   < 0.5 to the left,  improve=12.278720, (0 missing)
##       cash   < 0.5 to the left,  improve=11.181800, (0 missing)
##       send   < 0.5 to the left,  improve=10.543650, (0 missing)
##       mobile < 0.5 to the left,  improve= 9.452283, (0 missing)
##   Surrogate splits:
##       apply   < 0.5 to the left,  agree=0.997, adj=0.091, (0 split)
##       receive < 0.5 to the left,  agree=0.997, adj=0.091, (0 split)
## 
## Node number 17: 50 observations,    complexity param=0.01529637
##   predicted class=Spam        expected loss=0.42  P(node) =0.01281723
##     class counts:    21    29
##    probabilities: 0.420 0.580 
##   left son=34 (34 obs) right son=35 (16 obs)
##   Primary splits:
##       stop < 0.5 to the left,  improve=8.3011760, (0 missing)
##       send < 0.5 to the left,  improve=2.0944170, (0 missing)
##       yes  < 0.5 to the left,  improve=1.2503650, (0 missing)
##       free < 0.5 to the left,  improve=0.5504762, (0 missing)
##       now  < 0.5 to the left,  improve=0.2935548, (0 missing)
##   Surrogate splits:
##       send   < 0.5 to the left,  agree=0.82, adj=0.438, (0 split)
##       see    < 0.5 to the left,  agree=0.78, adj=0.312, (0 split)
##       friend < 0.5 to the left,  agree=0.78, adj=0.312, (0 split)
##       yes    < 0.5 to the left,  agree=0.74, adj=0.188, (0 split)
##       per    < 0.5 to the left,  agree=0.72, adj=0.125, (0 split)
## 
## Node number 18: 70 observations,    complexity param=0.01434034
##   predicted class=Legitimate  expected loss=0.4571429  P(node) =0.01794412
##     class counts:    38    32
##    probabilities: 0.543 0.457 
##   left son=36 (59 obs) right son=37 (11 obs)
##   Primary splits:
##       stop   < 0.5 to the left,  improve=5.331455, (0 missing)
##       mobile < 0.5 to the left,  improve=4.584127, (0 missing)
##       now    < 0.5 to the left,  improve=2.742857, (0 missing)
##       reply  < 0.5 to the left,  improve=2.488889, (0 missing)
##       new    < 0.5 to the left,  improve=2.488889, (0 missing)
##   Surrogate splits:
##       help < 0.5 to the left,  agree=0.886, adj=0.273, (0 split)
##       live < 0.5 to the left,  agree=0.886, adj=0.273, (0 split)
##       pls  < 0.5 to the left,  agree=0.886, adj=0.273, (0 split)
##       sms  < 0.5 to the left,  agree=0.886, adj=0.273, (0 split)
##       per  < 0.5 to the left,  agree=0.871, adj=0.182, (0 split)
## 
## Node number 19: 22 observations
##   predicted class=Spam        expected loss=0.1363636  P(node) =0.00563958
##     class counts:     3    19
##    probabilities: 0.136 0.864 
## 
## Node number 28: 26 observations
##   predicted class=Legitimate  expected loss=0.1538462  P(node) =0.006664958
##     class counts:    22     4
##    probabilities: 0.846 0.154 
## 
## Node number 29: 257 observations
##   predicted class=Spam        expected loss=0.3501946  P(node) =0.06588054
##     class counts:    90   167
##    probabilities: 0.350 0.650 
## 
## Node number 32: 3273 observations
##   predicted class=Legitimate  expected loss=0.03483043  P(node) =0.8390156
##     class counts:  3159   114
##    probabilities: 0.965 0.035 
## 
## Node number 33: 11 observations
##   predicted class=Spam        expected loss=0  P(node) =0.00281979
##     class counts:     0    11
##    probabilities: 0.000 1.000 
## 
## Node number 34: 34 observations
##   predicted class=Legitimate  expected loss=0.3823529  P(node) =0.008715714
##     class counts:    21    13
##    probabilities: 0.618 0.382 
## 
## Node number 35: 16 observations
##   predicted class=Spam        expected loss=0  P(node) =0.004101512
##     class counts:     0    16
##    probabilities: 0.000 1.000 
## 
## Node number 36: 59 observations,    complexity param=0.01338432
##   predicted class=Legitimate  expected loss=0.3728814  P(node) =0.01512433
##     class counts:    37    22
##    probabilities: 0.627 0.373 
##   left son=72 (52 obs) right son=73 (7 obs)
##   Primary splits:
##       mobile < 0.5 to the left,  improve=6.24706600, (0 missing)
##       now    < 0.5 to the left,  improve=3.48210900, (0 missing)
##       can    < 0.5 to the right, improve=0.12069290, (0 missing)
##       get    < 0.5 to the right, improve=0.12069290, (0 missing)
##       just   < 0.5 to the right, improve=0.03322034, (0 missing)
##   Surrogate splits:
##       claim  < 0.5 to the left,  agree=0.949, adj=0.571, (0 split)
##       yes    < 0.5 to the left,  agree=0.949, adj=0.571, (0 split)
##       today  < 0.5 to the left,  agree=0.932, adj=0.429, (0 split)
##       pobox  < 0.5 to the left,  agree=0.932, adj=0.429, (0 split)
##       chance < 0.5 to the left,  agree=0.932, adj=0.429, (0 split)
## 
## Node number 37: 11 observations
##   predicted class=Spam        expected loss=0.09090909  P(node) =0.00281979
##     class counts:     1    10
##    probabilities: 0.091 0.909 
## 
## Node number 72: 52 observations
##   predicted class=Legitimate  expected loss=0.2884615  P(node) =0.01332992
##     class counts:    37    15
##    probabilities: 0.712 0.288 
## 
## Node number 73: 7 observations
##   predicted class=Spam        expected loss=0  P(node) =0.001794412
##     class counts:     0     7
##    probabilities: 0.000 1.000

Randome Forest Classifier

Apply Random Forest to substantiate analysis of Decision Tree by plotting the importance of each token.

Sparse_train_data$MessageLabel %<>% as.factor()

#Applying the formula for Random Forest Algorithm
RFSpam_Tree_All <- randomForest(MessageLabel~., data = Sparse_train_data, ntree=25, proximity = T)

#To plot the Variable Importance Plot.
ImportancePlot <- varImpPlot(RFSpam_Tree_All, n.var=min(10, nrow(RFSpam_Tree_All$importance), main = "Importance of each Token"))

This plot also expresses that the most important token amongst all is ‘Call’.

# Importance of each token in a tabular form.
importance(RFSpam_Tree_All)
##            MeanDecreaseGini
## got            1.460661e+00
## great          8.270238e-01
## wat            1.601001e-01
## world          2.427161e-01
## lar            4.730884e-02
## apply          8.414157e+00
## free           4.351412e+01
## may            2.168562e-01
## receive        3.920573e+00
## text           1.842494e+01
## txt            7.315290e+01
## win            1.669104e+01
## already        3.532199e-01
## dun            6.003133e-02
## early          5.852846e-03
## say            3.325153e-02
## around         6.304604e-01
## think          6.262022e-01
## back           1.325551e+00
## fun            1.354714e+00
## hey            6.093561e-01
## like           7.336385e-01
## now            1.232298e+01
## send           6.049085e+00
## still          6.151698e-01
## word           1.464409e+00
## xxx            1.599048e+00
## even           2.200812e-01
## speak          9.985932e-01
## friends        2.322280e-01
## per            1.035186e+01
## call           5.700762e+01
## claim          5.054001e+01
## code           8.483541e+00
## customer       1.123862e+01
## network        2.846888e+00
## prize          2.805572e+01
## selected       3.180260e+00
## camera         3.485621e+00
## latest         3.110666e+00
## mobile         2.582238e+01
## enough         4.314412e-01
## gonna          4.014110e-01
## home           4.754806e-01
## soon           1.159409e-02
## stuff          2.224561e-02
## talk           7.815777e-01
## today          9.950108e-01
## tonight        4.603462e-01
## want           2.146138e+00
## cash           1.671871e+01
## cost           1.740806e+00
## days           1.150114e+00
## reply          1.849156e+01
## pobox          8.090096e+00
## urgent         8.879221e+00
## week           1.905177e+00
## won            1.117356e+01
## help           2.927631e+00
## right          3.731302e-01
## take           1.162102e+00
## thank          2.879093e-02
## will           3.187057e+00
## wont           1.538369e-01
## message        5.339677e+00
## next.          2.314284e+00
## use            1.038385e+00
## watching       8.368793e-03
## make           8.531552e-01
## name           5.312822e-01
## remember       4.836880e-01
## yes            9.638324e-01
## feel           1.204636e-01
## fine           5.782081e-03
## way            3.134745e-01
## dont           8.597595e-01
## miss           8.972639e-01
## going          8.172059e-01
## try            2.429739e-01
## first          5.367418e-01
## finish         5.881126e-01
## lor            2.679433e-01
## lunch          7.466951e-02
## can            4.039166e+00
## meet           2.282454e-01
## eat            4.161589e-02
## getting        6.472145e-01
## just           2.184637e+00
## lol            6.982793e-02
## really         2.297502e-02
## always         1.903866e-01
## bus            5.660931e-01
## dinner         2.420317e-02
## left           5.943667e-01
## love           1.290512e+00
## amp            9.707547e-01
## car            3.418773e-01
## know           1.528872e+00
## let            3.537807e-01
## room           7.858332e-04
## work           8.286549e-01
## live           6.731008e-01
## sure           8.727385e-01
## wait           6.774022e-01
## yeah           1.014626e-01
## anything       2.509478e-01
## tell           1.008272e+00
## month          5.843476e-01
## please         7.193894e+00
## thanks         9.535681e-01
## look           2.011986e-02
## msg            1.259507e+00
## yup            2.639240e-01
## done           1.352728e-01
## see            2.045939e+00
## hello          3.830804e-01
## trying         1.753783e-02
## pls            1.730337e+00
## weekend        8.518930e-01
## need           7.586405e-01
## sweet          2.107094e-02
## nokia          9.528240e+00
## sms            4.942906e+00
## tomorrow       3.535814e-01
## hope           7.246227e-01
## ltgt           2.925237e+00
## man            1.909288e-01
## well           3.180198e-01
## get            3.549518e+00
## ask            2.970662e-01
## bit            1.983690e-01
## maybe          6.698431e-04
## class          2.649139e-01
## time           1.766635e+00
## half           4.471857e-01
## morning        3.606567e-01
## place          1.502104e+00
## best           2.440202e-01
## give           8.533695e-01
## happy          9.663209e-02
## never          8.075516e-03
## sorry          7.435694e-01
## thought        3.674745e-01
## end            1.940042e+00
## new            7.762053e+00
## play           1.261553e+00
## find           1.774222e+00
## special        9.231898e-01
## year           3.657119e-01
## later          5.031632e+00
## meeting        2.557768e-02
## pick           2.512068e-01
## good           8.445903e-01
## part           4.098702e-01
## come           5.178405e-01
## check          1.484829e-01
## nice           2.910839e-02
## said           2.181560e-01
## awarded        8.110195e+00
## day            3.206504e+00
## hear           6.027832e-01
## money          5.365772e-01
## babe           3.309264e-01
## something      2.985565e-01
## wanna          8.123198e-01
## waiting        2.164357e+00
## cool           1.322648e-01
## thats          1.213763e-01
## much           6.839115e-01
## job            7.216955e-03
## looking        2.254024e+00
## stop           2.554950e+01
## one            1.514529e+00
## real           6.639699e-01
## bed            5.694816e-02
## another        2.595906e-03
## late           1.117946e+00
## night          8.321182e-01
## smile          8.849571e-02
## someone        7.657738e-01
## guaranteed     5.718541e+00
## service        1.673185e+01
## buy            2.872218e-01
## forgot         1.750938e-01
## nothing        1.607000e-02
## long           2.252564e-01
## yet            5.591431e-01
## guess          5.283703e-01
## dear           6.255323e-01
## life           4.710878e-01
## lot            1.581190e-01
## birthday       1.891482e-03
## aight          3.728384e-01
## better         4.458362e-01
## people         7.611604e-01
## cos            3.620207e-02
## things         4.633226e-01
## contact        1.294374e+01
## draw           3.213374e+00
## hrs            2.846567e-01
## last           6.415213e-02
## ppm            3.952001e+00
## shows          5.024618e+00
## went           1.270022e-01
## holiday        5.056378e+00
## account        2.321032e+00
## landline       4.399035e+00
## todays         4.447568e-01
## sent           4.082293e-01
## girl           4.144558e-01
## chat           1.669660e+01
## sir            4.036159e-01
## gud            2.037410e-02
## little         2.926519e-01
## luv            3.788218e-01
## thk            1.382940e-01
## house          6.553694e-02
## keep           6.769511e-01
## friend         6.334060e-01
## also           7.450538e-01
## liao           3.299030e-03
## coming         2.059025e-01
## cant           9.332197e-01
## ill            2.128583e-02
## offer          1.312581e+00
## guys           6.933979e-01
## working        1.329603e-01
## haha           2.207775e-02
## jus            6.233425e-02
## every          1.836474e+00
## dat            5.761043e-02
## big            2.297049e-03
## ready          6.991695e-01
## leh            1.203402e-01
## easy           4.880632e-01
## called         4.818370e-01
## nite           1.758755e-01
## start          6.118643e-01
## reach          1.312030e-01
## person         1.040787e-01
## everything     5.223904e-01
## thanx          1.320583e-02
## told           5.616981e-02
## watch          2.438757e-01
## asked          5.426730e-01
## didnt          1.250654e-01
## sleep          9.178553e-02
## min            4.693691e-01
## care           7.321561e-01
## mins           4.820231e+00
## video          4.895026e+00
## shopping       1.683557e-01
## plan           1.549709e-02
## box            5.319198e+00
## might          3.066640e-01
## baby           6.080120e-02
## hour           7.404992e-02
## phone          2.590955e+00
## shit           6.268706e-02
## dunno          1.090167e-01
## problem        6.423522e-01
## line           2.504573e+00
## number         8.564100e-01
## chance         2.963449e+00
## two            1.094674e-01
## ever           2.582146e-01
## minutes        1.221336e-01
## orange         8.158265e+00
## wish           4.684970e-01
## quite          4.296295e-01
## leave          6.789973e-01
## sat            3.098569e-01
## actually       2.985013e-01
## put            8.051255e-02
## god            2.761475e-01
## tone           9.595441e+00
## thing          2.321083e-01
## den            6.011938e-04
## heart          1.699929e-01
## mind           3.487982e-01
## bad            1.103589e+00
## tcs            5.932818e+00
## enjoy          1.102578e+00
## princess       6.707518e-02
## many           4.693232e-01
## shall          1.128283e-01
## kiss           5.212154e-02
## probably       2.098691e-01
## dad            6.407980e-03
## wan            2.524935e-01

Test the above Random Forest Model on test data and check the accuracy, precision, recall and F1.

Sparse_test_data$MessageLabel %<>% as.factor()

RFTest_All <- predict(RFSpam_Tree_All, newdata =Sparse_test_data)

# Confusion Matrix
RFTest_Matrix_All <- confusionMatrix(predict(RFSpam_Tree_All, newdata =Sparse_test_data), Sparse_test_data$MessageLabel)
RFTest_Matrix_All
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Legitimate Spam
##   Legitimate       1438   54
##   Spam                9  170
##                                          
##                Accuracy : 0.9623         
##                  95% CI : (0.952, 0.9709)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8225         
##  Mcnemar's Test P-Value : 2.965e-08      
##                                          
##             Sensitivity : 0.9938         
##             Specificity : 0.7589         
##          Pos Pred Value : 0.9638         
##          Neg Pred Value : 0.9497         
##              Prevalence : 0.8659         
##          Detection Rate : 0.8606         
##    Detection Prevalence : 0.8929         
##       Balanced Accuracy : 0.8764         
##                                          
##        'Positive' Class : Legitimate     
## 
# CrossTable
CrossTable(RFTest_All, Sparse_test_data$MessageLabel, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | Sparse_test_data$MessageLabel 
##   RFTest_All | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1438 |         54 |       1492 | 
##              |      0.964 |      0.036 |      0.893 | 
##              |      0.994 |      0.241 |            | 
##              |      0.861 |      0.032 |            | 
## -------------|------------|------------|------------|
##         Spam |          9 |        170 |        179 | 
##              |      0.050 |      0.950 |      0.107 | 
##              |      0.006 |      0.759 |            | 
##              |      0.005 |      0.102 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.97, while for predicting spam messages is 0.94. 2. Recall for predicting Legitimate messages is 0.99, while for predicting spam messages is 0.78. Therefore, we can deduce that the probability of predicting a Legitmate message as a Spam is quite less (0.03) as compared to the probability of a spam message being predicted as a legitimate message (0.03).

Accuracy for test Data.

TestPredictability_All <- sum(RFTest_All == Sparse_test_data$MessageLabel)/ length(Sparse_test_data$MessageLabel)*100

message("Predcitability Percentage for Test Data is:")
## Predcitability Percentage for Test Data is:
print(TestPredictability_All)
## [1] 96.2298

Plot Confusion Matrix

Reference_RF_All <- factor(c("Legitimate", "Legitimate", "Spam", "Spam"))
Prediction_RF_All <- factor(c("Legitimate", "Spam","Legitimate","Spam"))
Y_All <- c(1440, 7, 49, 175)
ConfusionMatrixPlot_All <- data.frame(Reference_RF_All, Prediction_RF_All, Y_All)

# Plot
ggplot(data =  ConfusionMatrixPlot_All, mapping = aes(x = Reference_RF_All, y = Prediction_RF_All)) +
     geom_tile(aes(fill = Y_All), colour = "white") +
     geom_text(aes(label = sprintf("%1.0f", Y_All)), vjust = 1) +
     scale_fill_gradient(low = "yellow", high = "dark green") +
     theme_bw() + theme(legend.position = "none")

Support Vector Machine

SMS_SVM_All <- svm(MessageLabel ~., data = Sparse_train_data, kernel = "linear", cost = 0.1, gamma = 0.1)
SVMTest_All <- predict(SMS_SVM_All, Sparse_test_data)

# Confusion Matrix
SVM_Measure_All <- confusionMatrix(predict(SMS_SVM_All, newdata = Sparse_test_data), Sparse_test_data$MessageLabel)

# CrossTable
CrossTable(SVMTest_All, Sparse_test_data$MessageLabel, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | Sparse_test_data$MessageLabel 
##  SVMTest_All | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1412 |         28 |       1440 | 
##              |      0.981 |      0.019 |      0.862 | 
##              |      0.976 |      0.125 |            | 
##              |      0.845 |      0.017 |            | 
## -------------|------------|------------|------------|
##         Spam |         35 |        196 |        231 | 
##              |      0.152 |      0.848 |      0.138 | 
##              |      0.024 |      0.875 |            | 
##              |      0.021 |      0.117 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.98, while for predicting spam messages is 0.85. 2. Recall for predicting Legitimate messages is 0.98, while for predicting spam messages is 0.88. Therefore, we can deduce that the probability of predicting a Legitmate message as a Spam is quite high (0.15) as compared to the probability of a spam message being predicted as a legitimate message (0.02).

Accuracy for test data.

svm.accuracy.table_All <- as.data.frame(table(Sparse_test_data$MessageLabel, SVMTest_All))
print(paste("Accuracy for SVM is:",
            100*round(((svm.accuracy.table_All$Freq[1]+svm.accuracy.table_All$Freq[4])/nrow(Sparse_test_data)), 4),
            "%"))
## [1] "Accuracy for SVM is: 96.23 %"

Plot Confusion Matrix

Reference_SVM_All <- factor(c("Legitimate", "Legitimate", "Spam", "Spam"))
Prediction_SVM_All <- factor(c("Legitimate", "Spam","Legitimate","Spam"))
Y_SVM_All <- c(1412, 35, 28, 196)
ConfusionMatrixPlot_SVM_All <- data.frame(Reference_SVM_All, Prediction_SVM_All, Y_SVM_All)

# Plot
ggplot(data =  ConfusionMatrixPlot_SVM_All, mapping = aes(x = Reference_SVM_All, y = Prediction_SVM_All)) +
     geom_tile(aes(fill = Y_SVM_All), colour = "white") +
     geom_text(aes(label = sprintf("%1.0f", Y_SVM_All)), vjust = 1) +
     scale_fill_gradient(low = "yellow", high = "dark green") +
     theme_bw() + theme(legend.position = "none")

Logistic Regression

SMS_GLM_All <- glm(MessageLabel ~., data = Sparse_train_data, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
GLMTest_All <- predict(SMS_GLM_All, Sparse_test_data, type = 'response')

#Confusion Matrix
GLM_Matrix_All <- table(Sparse_test_data$MessageLabel, GLMTest_All > 0.5)
GLM_Matrix_All
##             
##              FALSE TRUE
##   Legitimate  1415   32
##   Spam          38  186
summary(SMS_GLM_All)
## 
## Call:
## glm(formula = MessageLabel ~ ., family = "binomial", data = Sparse_train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1361  -0.0302   0.0000   0.0000   3.4952  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.106e+00  5.331e-01 -11.454  < 2e-16 ***
## got         -4.543e-01  1.471e+00  -0.309 0.757462    
## great       -1.311e-02  1.287e+00  -0.010 0.991875    
## wat         -1.720e+01  9.612e+03  -0.002 0.998572    
## world       -1.782e+01  1.545e+04  -0.001 0.999080    
## lar         -1.964e+01  1.569e+04  -0.001 0.999001    
## apply        4.504e+01  1.679e+04   0.003 0.997859    
## free         3.371e+00  8.598e-01   3.921 8.83e-05 ***
## may         -1.816e+01  1.750e+04  -0.001 0.999172    
## receive      8.233e-01  1.973e+00   0.417 0.676544    
## text         3.917e+00  9.687e-01   4.044 5.26e-05 ***
## txt          5.045e+01  3.976e+03   0.013 0.989878    
## win          5.004e+00  1.777e+00   2.816 0.004869 ** 
## already      1.828e+00  1.475e+00   1.239 0.215359    
## dun         -1.394e+01  1.197e+04  -0.001 0.999071    
## early       -2.213e+01  1.838e+04  -0.001 0.999039    
## say         -1.835e+01  9.569e+03  -0.002 0.998470    
## around       4.860e+00  1.575e+00   3.085 0.002035 ** 
## think       -2.431e+01  7.463e+03  -0.003 0.997401    
## back         1.234e-02  1.293e+00   0.010 0.992383    
## fun          2.441e+00  1.680e+00   1.453 0.146228    
## hey         -3.432e+00  2.552e+00  -1.345 0.178667    
## like         7.103e-01  1.197e+00   0.594 0.552774    
## now          2.226e+00  6.327e-01   3.519 0.000434 ***
## send         3.579e+00  9.774e-01   3.662 0.000250 ***
## still       -4.671e+00  2.937e+00  -1.590 0.111761    
## word         1.948e-01  1.844e+00   0.106 0.915887    
## xxx          6.951e+00  2.159e+00   3.219 0.001285 ** 
## even        -1.021e+00  3.217e+00  -0.317 0.750957    
## speak       -6.358e+01  8.812e+03  -0.007 0.994243    
## friends     -6.491e+00  5.818e+00  -1.116 0.264588    
## per          7.656e+00  1.934e+00   3.958 7.57e-05 ***
## call         2.175e+00  5.507e-01   3.950 7.80e-05 ***
## claim        9.191e+01  8.925e+03   0.010 0.991784    
## code         2.199e+01  2.041e+04   0.001 0.999140    
## customer     2.073e+00  1.557e+00   1.331 0.183079    
## network      7.530e+00  4.224e+00   1.783 0.074636 .  
## prize        2.936e+01  7.418e+03   0.004 0.996842    
## selected    -1.025e+00  1.121e+01  -0.091 0.927149    
## camera      -1.642e+00  3.924e+01  -0.042 0.966616    
## latest       4.061e+00  3.894e+01   0.104 0.916953    
## mobile       3.784e+00  8.384e-01   4.513 6.39e-06 ***
## enough      -2.068e+01  1.892e+04  -0.001 0.999128    
## gonna       -6.736e+01  1.259e+04  -0.005 0.995731    
## home        -8.667e+00  5.050e+00  -1.716 0.086128 .  
## soon        -1.654e+01  7.496e+03  -0.002 0.998239    
## stuff        4.467e-01  1.692e+00   0.264 0.791728    
## talk         2.794e+00  1.380e+00   2.025 0.042828 *  
## today       -1.372e+00  1.750e+00  -0.784 0.433190    
## tonight     -1.198e-01  5.571e+00  -0.022 0.982837    
## want         1.082e+00  6.602e-01   1.639 0.101227    
## cash         2.583e+00  1.393e+00   1.854 0.063698 .  
## cost         7.456e+00  2.334e+00   3.195 0.001397 ** 
## days         2.581e+00  1.928e+00   1.339 0.180587    
## reply        5.211e+00  1.291e+00   4.036 5.43e-05 ***
## pobox        9.561e+01  1.373e+04   0.007 0.994442    
## urgent       2.470e+00  2.252e+00   1.097 0.272729    
## week        -1.639e+00  1.561e+00  -1.050 0.293718    
## won          3.166e+01  1.079e+04   0.003 0.997659    
## help         6.225e+00  1.155e+00   5.390 7.05e-08 ***
## right       -7.274e+00  9.509e+00  -0.765 0.444302    
## take         1.377e-01  1.819e+00   0.076 0.939652    
## thank       -3.519e+00  2.593e+01  -0.136 0.892070    
## will         9.335e-01  7.987e-01   1.169 0.242519    
## wont        -2.368e+01  1.478e+04  -0.002 0.998721    
## message      1.240e+00  1.088e+00   1.140 0.254277    
## next.        3.664e+00  1.358e+00   2.697 0.006994 ** 
## use          2.786e+00  1.365e+00   2.041 0.041285 *  
## watching    -1.749e+01  1.819e+04  -0.001 0.999233    
## make         7.918e-02  4.635e+00   0.017 0.986370    
## name         6.671e-01  4.062e+00   0.164 0.869560    
## remember    -1.192e+02  1.925e+04  -0.006 0.995060    
## yes          3.682e-01  1.945e+00   0.189 0.849855    
## feel        -1.855e+01  1.011e+04  -0.002 0.998536    
## fine        -1.783e+01  1.756e+04  -0.001 0.999190    
## way         -2.596e+01  7.944e+03  -0.003 0.997392    
## dont         3.431e-01  1.122e+00   0.306 0.759766    
## miss         1.146e+00  1.893e+00   0.606 0.544765    
## going       -2.716e+01  6.505e+03  -0.004 0.996668    
## try         -4.359e+00  7.030e+00  -0.620 0.535254    
## first       -9.562e-01  2.110e+00  -0.453 0.650353    
## finish      -7.645e+01  1.662e+04  -0.005 0.996329    
## lor         -2.246e+01  1.260e+04  -0.002 0.998577    
## lunch       -1.715e+01  1.465e+04  -0.001 0.999066    
## can         -1.098e+00  8.960e-01  -1.226 0.220222    
## meet        -7.232e-01  3.236e+00  -0.223 0.823159    
## eat         -1.672e+01  1.478e+04  -0.001 0.999097    
## getting      3.478e+00  1.422e+00   2.446 0.014439 *  
## just        -1.406e+00  1.087e+00  -1.293 0.195952    
## lol         -2.015e+01  1.301e+04  -0.002 0.998765    
## really      -2.059e+01  9.936e+03  -0.002 0.998347    
## always      -3.268e+01  1.099e+04  -0.003 0.997628    
## bus         -1.864e+01  1.346e+04  -0.001 0.998895    
## dinner      -1.990e+01  1.602e+04  -0.001 0.999009    
## left         2.642e+00  2.013e+00   1.312 0.189432    
## love        -6.550e-01  2.277e+00  -0.288 0.773560    
## amp         -1.560e+01  8.706e+03  -0.002 0.998570    
## car         -1.660e+01  1.177e+04  -0.001 0.998874    
## know        -3.808e+00  1.947e+00  -1.956 0.050452 .  
## let         -4.275e+01  2.160e+04  -0.002 0.998421    
## room        -1.679e+01  1.526e+04  -0.001 0.999122    
## work        -1.592e+01  2.604e+03  -0.006 0.995122    
## live         6.913e-01  8.378e+00   0.083 0.934235    
## sure        -1.964e+01  1.329e+04  -0.001 0.998821    
## wait        -1.918e+01  6.353e+03  -0.003 0.997591    
## yeah        -2.306e+01  1.486e+04  -0.002 0.998762    
## anything    -2.763e+01  1.310e+04  -0.002 0.998317    
## tell        -3.038e+00  1.751e+00  -1.735 0.082715 .  
## month       -6.109e+00  3.261e+00  -1.873 0.061037 .  
## please       1.915e+00  9.694e-01   1.976 0.048152 *  
## thanks       5.366e-01  1.383e+00   0.388 0.698084    
## look         2.559e+00  2.757e+00   0.928 0.353302    
## msg          3.949e+00  1.399e+00   2.822 0.004775 ** 
## yup         -2.376e+01  1.642e+04  -0.001 0.998846    
## done        -1.733e+01  1.616e+04  -0.001 0.999144    
## see         -1.300e-02  1.330e+00  -0.010 0.992202    
## hello        4.308e+00  1.410e+00   3.054 0.002255 ** 
## trying      -1.615e+01  4.955e+04   0.000 0.999740    
## pls         -2.257e+00  1.728e+00  -1.306 0.191561    
## weekend     -9.776e-01  1.902e+00  -0.514 0.607278    
## need        -3.990e+00  2.358e+00  -1.692 0.090658 .  
## sweet       -2.316e+01  1.437e+04  -0.002 0.998715    
## nokia       -2.602e+00  1.262e+01  -0.206 0.836656    
## sms          3.329e+00  1.396e+00   2.385 0.017099 *  
## tomorrow     1.804e+00  1.911e+00   0.944 0.345156    
## hope         8.973e-02  2.740e+00   0.033 0.973880    
## ltgt        -3.289e+01  3.928e+03  -0.008 0.993318    
## man         -2.038e+01  1.427e+04  -0.001 0.998860    
## well        -2.398e+01  9.446e+03  -0.003 0.997975    
## get          4.705e-01  7.914e-01   0.595 0.552165    
## ask         -1.774e+01  9.901e+03  -0.002 0.998571    
## bit         -7.084e+01  1.307e+04  -0.005 0.995674    
## maybe       -1.479e+01  1.810e+04  -0.001 0.999348    
## class       -3.378e+01  4.196e+03  -0.008 0.993577    
## time         1.085e+00  1.183e+00   0.917 0.359216    
## half        -5.133e+00  6.764e+01  -0.076 0.939507    
## morning     -2.837e+01  8.726e+03  -0.003 0.997406    
## place        3.968e-02  1.578e+00   0.025 0.979943    
## best         2.210e+00  2.133e+00   1.036 0.300118    
## give        -5.659e-01  1.715e+00  -0.330 0.741458    
## happy       -7.980e+00  1.165e+01  -0.685 0.493404    
## never        7.241e-01  1.479e+00   0.490 0.624333    
## sorry       -9.739e-01  1.298e+00  -0.750 0.453053    
## thought     -2.165e+01  1.807e+04  -0.001 0.999044    
## end          1.404e+00  5.677e+00   0.247 0.804705    
## new          3.937e+00  9.170e-01   4.293 1.76e-05 ***
## play         9.409e-01  2.450e+00   0.384 0.700914    
## find         6.067e+00  2.237e+00   2.712 0.006693 ** 
## special      4.168e+00  1.817e+00   2.294 0.021801 *  
## year        -1.987e+01  1.808e+04  -0.001 0.999123    
## later       -2.767e+01  1.369e+04  -0.002 0.998387    
## meeting     -3.650e+01  1.477e+04  -0.002 0.998028    
## pick        -2.914e-01  1.736e+00  -0.168 0.866696    
## good         4.275e-01  1.182e+00   0.362 0.717475    
## part         3.195e+00  1.906e+00   1.676 0.093655 .  
## come        -1.513e+00  1.522e+00  -0.995 0.319926    
## check        1.616e+00  1.443e+00   1.120 0.262676    
## nice        -1.842e+01  1.172e+04  -0.002 0.998746    
## said        -2.075e+01  7.676e+03  -0.003 0.997843    
## awarded      1.309e+01  1.026e+04   0.001 0.998982    
## day          2.168e+00  8.910e-01   2.433 0.014966 *  
## hear         2.582e+00  1.960e+00   1.317 0.187690    
## money        3.064e+00  1.786e+00   1.715 0.086347 .  
## babe         4.156e+00  1.203e+00   3.454 0.000552 ***
## something   -1.995e+01  1.174e+04  -0.002 0.998645    
## wanna       -4.300e+00  3.763e+00  -1.143 0.253161    
## waiting      1.914e+00  1.440e+00   1.329 0.183828    
## cool        -1.503e+00  1.804e+00  -0.833 0.404731    
## thats       -2.782e+01  1.400e+04  -0.002 0.998415    
## much        -5.297e+00  2.528e+00  -2.095 0.036158 *  
## job         -2.124e+01  1.146e+04  -0.002 0.998520    
## looking      4.744e+00  5.366e+00   0.884 0.376609    
## stop         5.784e+00  1.775e+00   3.259 0.001118 ** 
## one          9.137e-01  1.243e+00   0.735 0.462169    
## real        -1.356e+00  5.764e+00  -0.235 0.814043    
## bed         -2.168e+01  1.741e+04  -0.001 0.999006    
## another     -1.211e+01  4.878e+03  -0.002 0.998019    
## late         1.948e+00  1.325e+00   1.470 0.141491    
## night       -2.294e+00  2.022e+00  -1.135 0.256417    
## smile       -1.187e+01  9.307e+03  -0.001 0.998983    
## someone     -7.129e+00  3.003e+00  -2.374 0.017594 *  
## guaranteed   2.356e+01  1.124e+04   0.002 0.998328    
## service      4.559e+00  2.236e+00   2.039 0.041485 *  
## buy          2.343e+00  1.717e+00   1.365 0.172347    
## forgot      -1.438e+01  1.419e+04  -0.001 0.999192    
## nothing     -3.641e+00  1.577e+01  -0.231 0.817483    
## long        -1.820e+01  1.581e+04  -0.001 0.999082    
## yet          2.280e-01  2.416e+00   0.094 0.924816    
## guess        2.934e+00  1.958e+00   1.499 0.133985    
## dear         1.241e+00  1.081e+00   1.148 0.250810    
## life         2.572e-01  2.356e+00   0.109 0.913047    
## lot         -1.977e+01  1.654e+04  -0.001 0.999046    
## birthday    -1.494e+01  1.762e+04  -0.001 0.999323    
## aight       -1.836e+01  2.088e+04  -0.001 0.999298    
## better      -1.689e+01  1.123e+04  -0.002 0.998800    
## people       1.848e+00  2.901e+00   0.637 0.524044    
## cos         -1.808e+01  1.289e+04  -0.001 0.998881    
## things       1.622e-01  2.685e+00   0.060 0.951823    
## contact      4.770e+00  2.382e+00   2.002 0.045289 *  
## draw         5.639e+00  1.571e+00   3.589 0.000332 ***
##  [ reached getOption("max.print") -- omitted 91 rows ]
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3074.36  on 3900  degrees of freedom
## Residual deviance:  283.73  on 3610  degrees of freedom
## AIC: 865.73
## 
## Number of Fisher Scoring iterations: 23

Analysing the summary for Logistic Regression train model, we can infer that: 1. Distribution of residuals is not so symmetrical. That is, that model is also predicting points far away from the actual observed points. 2. The model reveals that ‘call’ is the most important terms as its value of error is same as the value of error for Intercept.

Accuracy for test data.

glm.accuracy.table.All <- as.data.frame(table(Sparse_test_data$MessageLabel, GLMTest_All > 0.75))
print(paste("Accuracy of Logistic Regression is:",
            100*round(((glm.accuracy.table.All$Freq[1]+glm.accuracy.table.All$Freq[4])/nrow(Sparse_test_data)), 4),
            "%"))
## [1] "Accuracy of Logistic Regression is: 96.17 %"

ROCR Curve

library(ROCR)
Logistic_Regression_Prediction_All <- prediction(abs(GLMTest_All), Sparse_test_data$MessageLabel)
Logistic_Regression_Performance_All <- performance(Logistic_Regression_Prediction_All,"tpr","fpr")
plot(Logistic_Regression_Performance_All, colorize = TRUE, text.adj = c(-0.2,1.7))

The ROCR curve substantiates the high accuracy of the model as the closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test.

Naive Bayes Model

SMS_NB_All = naiveBayes(MessageLabel ~. , data = Sparse_train_data, laplace = 1)
SMS_NBTest_All = predict(SMS_NB_All, Sparse_test_data) 


library(gmodels)
CT <- CrossTable(SMS_NBTest_All, Sparse_test_data$MessageLabel, 
           prop.chisq = FALSE, 
           prop.t = FALSE, 
           dnn = c("Predicted", "Actual")) #Name of column
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | Actual 
##    Predicted | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |        127 |          2 |        129 | 
##              |      0.984 |      0.016 |      0.077 | 
##              |      0.088 |      0.009 |            | 
## -------------|------------|------------|------------|
##         Spam |       1320 |        222 |       1542 | 
##              |      0.856 |      0.144 |      0.923 | 
##              |      0.912 |      0.991 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.98, while for predicting spam messages is 0.144. 2. Recall for predicting Legitimate messages is 0.08, while for predicting spam messages is 0.99. Therefore, we can deduce that the probability of predicting a Legitmate message as a Spam is quite high (0.86) as compared to the probability of a spam message being predicted as a legitimate message (0.02).

Accuracy for test data.

nb.accuracy.table.all <- as.data.frame(table(Sparse_test_data$MessageLabel, SMS_NBTest_All))
print(paste("Accuracy for NB is:",
             100*round(((nb.accuracy.table.all$Freq[1]+nb.accuracy.table.all$Freq[4])/nrow(Sparse_test_data)), 4),
             "%"))
## [1] "Accuracy for NB is: 20.89 %"